* This do-file reads in data from Credit Bureau and the CPS and saves a clean collapsed version of the data for summary statistics analysis.
* Authors: Monica Essig Aberg & Ellora Derenoncourt

	* Load industry names
	insheet using "$data/cb/documentation/naics2d_names.csv", clear names
	tempfile naics2d_names
	save `naics2d_names'
	di "`naics2d_names'"

	* Load establishments and companies in sample 
	use cz cmp_company_code using "$data/cb/stacked_nonpolicy_firm_dataset.dta", clear
	duplicates drop
	tempfile insample
	save `insample', replace
	contract cmp_company_code
	drop _freq
	tempfile insample_cmp
	save `insample_cmp', replace
	
	* Load Credit Bureau data, limit to in sample establishments
	use "$data/cb/clean_nonpolicy_firm_cz.dta", clear
	merge m:1 cz cmp_company_code using `insample', gen(estab_samp)
	merge m:1 cmp_company_code using `insample_cmp', gen(firm_samp)
	keep tot_emp_7_29 tot_emp avg_wage_exact cmp_company_code mdate er_naics estab_samp firm_samp
	append using "$data/cb/clean_policy_firm_cz.dta"
	drop if cmp_company_code == 66666
	replace estab_samp = 3 if estab_samp == .
	replace firm_samp = 3 if firm_samp == .
	keep tot_emp_7_29 tot_emp avg_wage_exact cmp_company_code mdate er_naics estab_samp firm_samp // Eventually will be exact average hourly wage
	
	* Get share wage <$30
	gen eq_share_wage_lt30 = tot_emp_7_29/tot_emp if estab_samp == 3
	sum eq_share_wage_lt30 [aw = tot_emp]
	local eq_share_wage_lt30 `r(mean)'
	
	* Get average wage and establishment size
	sum avg_wage if estab_samp == 3 [aw = tot_emp_7_29]
	local avg_wage `r(mean)'
	sum tot_emp if estab_samp == 3, d
	local eq_avg_estab_size `r(mean)'
	local eq_med_estab_size `r(p50)'
	
	* Get average firm size
	preserve
		keep if firm_samp == 3
		collapse (sum) tot_emp, by(cmp mdate)
		sum tot_emp, d
		local eq_avg_firm_size `r(mean)'
		local eq_med_firm_size `r(p50)'
	restore

	* Industry shares
	keep if estab_samp == 3
	
	* Get industry category
	replace er_naics = 452 if cmp_company_code == 22222 | cmp_company_code == 44444 | cmp_company_code == 33333
	replace er_naics = 722 if cmp_company_code == 55555
	replace er_naics = 454 if cmp_company_code == 11111
	gen naics2d=substr(string(er_naics),1,2)
	replace naics2d="44-45" if naics2d=="44" | naics2d=="45"
	replace naics2d="31-33" if naics2d=="31" | naics2d=="32" | naics2d=="33" 
	replace naics2d="48-49" if naics2d=="48" | naics2d=="49"

	* Bring in industry names
	merge m:1 naics2d using `naics2d_names'
	replace naics2d = "99" if _merge==1
	replace naics2d_name = "Missing" if _merge==1
	drop _merge

	* Get industry counts
	collapse (sum) tot_emp, by(naics2d naics2d_name)
	rename tot_emp eq_indcount
	
	* Save avg wage and size info
	gen eq_avg_wage = `avg_wage'
	gen eq_avg_firm_size = `eq_avg_firm_size'
	gen eq_med_firm_size = `eq_med_firm_size'
	gen eq_avg_estab_size = `eq_avg_estab_size'
	gen eq_med_estab_size = `eq_med_estab_size'
	gen eq_share_wage_lt30 = `eq_share_wage_lt30'

	* Save
	tempfile cb
	save `cb', replace 

* Get CPS industry counts and avg wage - full sample

	* Load
	use "$data/cps/output/cpsorg_2013-2023.dta", clear
	
	* Keep employed only
	keep if emp==1

	* Crosswalk NAICS industries for 2020-2023
	preserve
		contract dind03 naics_v2, freq(count)
		drop count
		drop if naics_v2 == "."
		rename naics_v2 naics_v3
		tempfile naics_cw
		save `naics_cw', replace
	restore
	merge m:1 dind03 using `naics_cw', nogen
	tab naics_v3 year, miss
	replace naics_v2 = naics_v3
	drop naics_v3
	
	* Get share with wage < $30
	gen cps_full_share_wage_lt30 = wageotc_noadj < 30
	sum cps_full_share_wage_lt30
	local cps_full_share_wage_lt30 `r(mean)'
	
	* Get industry counts (weighted and unweighted) and mean wages
	sum wageotc_noadj if wageotc_noadj < 30
	local avg_wage `r(mean)'
	contract naics_v2, freq(cps_full_indcount)
	gen cps_full_avg_wage = `avg_wage'
	gen cps_full_share_wage_lt30 = `cps_full_share_wage_lt30'
	
	* Clean and save
	replace naics_v2 = "99" if naics_v2 == "." | naics_v2 == ""
	rename naics_v2 naics2d
	
	merge 1:1 naics2d using `cb', nogen
	order naics2d naics2d_name 
	
* Get QCEW establishment size

	preserve
	
		* Load data
		foreach year of numlist 2013(1)2023 {
			append using "$data/qcew/raw/qcew_`year'_agg", force
		}
		
		* Keep total only
		destring industry_code, replace
		keep if inlist(industry_code, 10)
		
		* Get average firm size
		drop if month1+month2+month3 == 0
		gen avg_size = (month1+month2+month3)/(3*qtrly_estabs)
		sum avg_size [aw = qtrly_estabs]
		
	restore
	
	gen qcew_avg_estab_size = `r(mean)'
	
* Get BLS median firm size from https://www.bls.gov/web/cewbd/table_g.txt

	preserve
	
		* Size class definitions 
		import excel using "$data/bls/table_g.xlsx", sheet(sizeclass) clear firstrow
		tempfile sizeclass
		save `sizeclass', replace
		
		* Firms by year by size class
		import excel using "$data/bls/table_g.xlsx", sheet(data) clear firstrow
		
		* Keep sample year
		keep if inrange(year, 2013, 2023)
		
		* Long by size class
		reshape long firms_, i(year) j(group)
		collapse (sum) firms, by(group)
		
		* Calculate shares
		egen tot = sum(firms)
		gen pct = firms/tot
		
		* Find median group (over50 == 1)
		sort group
		gen runpct = sum(pct)
		gen over50 = runpct > .5
		replace over50 = sum(over50)
		
		* Store locals
		merge 1:1 group using `sizeclass', nogen
		sum group if over50 == 1
		local min = min[`r(mean)']
		local max = max[`r(mean)']
		
	restore
	
	gen bls_med_firm_size = "`min' to `max'"
	if "`max'" == "." {
		replace bls_med_firm_size = "`min'+"
	}
	
* Calculate shares

	* Credit Bureau
	egen eq_tot = sum(eq_indcount)
	gen eq_indpct = eq_indcount/eq_tot

	* CPS
	egen cps_full_tot = sum(cps_full_indcount)
	gen cps_full_indpct = cps_full_indcount/cps_full_tot

* Prepare for graphing
	* Shorten industry names
	replace naics2d_name = "Agriculture" if naics2d_name == "Agriculture, Forestry, Fishing and Hunting"
	replace naics2d_name = "Mining" if naics2d_name == "Mining, Quarrying, and Oil and Gas Extraction"
	replace naics2d_name = "Retail" if naics2d_name == "Retail Trade"
	replace naics2d_name = "Transportation/Warehousing" if naics2d_name == "Transportation and Warehousing"
	replace naics2d_name = "Finance/Insurance" if naics2d_name == "Finance and Insurance"
	replace naics2d_name = "Real Estate" if naics2d_name == "Real Estate and Rental and Leasing"
	replace naics2d_name = "Prof/Sci/Tech Services" if naics2d_name == "Professional, Scientific, and Technical Services"
	replace naics2d_name = "Management" if naics2d_name == "Management of Companies and Enterprises"
	replace naics2d_name = "Administrative/Support Services" if naics2d_name == "Administrative and Support and Waste Management and Remediation Services"
	replace naics2d_name = "Education" if naics2d_name == "Educational Services"
	replace naics2d_name = "Health" if naics2d_name == "Health Care and Social Assistance"
	replace naics2d_name = "Arts" if naics2d_name == "Arts, Entertainment, and Recreation"
	replace naics2d_name = "Accommodation/Food Service" if naics2d_name == "Accommodation and Food Services"
	replace naics2d_name = "Other Services" if naics2d_name == "Other Services (except Public Administration)"
	replace naics2d_name = "Public Admin" if naics2d_name == "Public Administration (not covered in economic census)"
	replace naics2d_name = "Unknown" if naics2d_name == "Missing"

	* Scale percent
	foreach var of varlist *pct {
		replace `var' = `var'*100
	}
	
	* Create ID for graphing
	gsort eq_indpct
	cap drop id*
	gen id = _n
	gen id2 = id-.2
	gen id3 = id2+.4
	
	* Label ID
	gsort eq_indpct
	global labels
	foreach num of numlist 1(1)21 {
		local name = naics2d_name[`num']
		global labels $labels `num' `"`name'"' 
	}

	save "$data/cb/clean_summary_statistics_credit_bureau_cps.dta", replace
	